---
title: "Cleaning Middle East Values Study: Comparative Panel Survey on the Dynamics of Change: Belief Formation and Political Engagement in Egypt, Tunisia, and Turkey"
---

# Load

```{r}
# attach the packages used throughout this document (listed in the helper script)
  source("helper-packages.R")

# read the raw Stata file; both panel waves sit side by side (wide) in this one file
  doc_raw <-
    read_dta(
      file = "../raw-data/y-multi-middle-east-values-study/comparative-panel-survey-on-the-dynamics-of-change/PanelData_Dataset_1.0.dta")
```

# Divide into two surveys

This is a panel survey, shaped wide; thus, make 2 objects, one for each time period, and stack.

```{r}
# Wave 1 (T1): keep the *_T1 items and drop the wave suffix so that both
# waves share the same column names when stacked
  doc_t1_clean <-
    select(
      doc_raw,
      DATE = DATE_T1,
      country,
      UrbanRural_T1,
      V3 = V3_T1,
      V214 = V214_T1,
      V215 = V215_T1,
      V202 = V202_T1,
      V149 = V149_T1, # neighbor items: V149, V154, V155, V156
      V154 = V154_T1,
      V155 = V155_T1,
      V156 = V156_T1,
      V164 = V164_T1, # trust items: V164, V165, V168, V169
      V165 = V165_T1,
      V168 = V168_T1,
      V169 = V169_T1,
      V71 = V71_T1 # religiosity
    )

  # V3 is matched against label strings later on, so convert it to character here
  doc_t1_clean <-
    mutate(
      doc_t1_clean,
      V3 = to_character(V3),
      resp_round = "Wave 1")

# Wave 2 (T2): same layout as wave 1, with these differences
#   - V154 and V168 are not available for T2
#   - RESULTCODE_T2 and the general-trust item (T2_V207) exist only for T2
#   - the urban/rural column is the T1 one
#     NOTE(review): confirm there is no wave-2 urban/rural variable to use instead
  doc_t2_clean <-
    select(
      doc_raw,
      DATE = DATE_T2,
      country,
      UrbanRural_T1,
      V3 = V3_T2,
      RESULTCODE_T2,
      V214 = V214_T2,
      V215 = V215_T2,
      V202 = V202_T2,
      V149 = V149_T2, # neighbor items
      V155 = V155_T2,
      V156 = V156_T2,
      V164 = V164_T2, # trust items
      V165 = V165_T2,
      V169 = V169_T2,
      T2_V207 = T2_V207_TN, # general trust
      V71 = V71_T2 # religiosity
    )

  doc_t2_clean <-
    mutate(
      doc_t2_clean,
      V3 = to_character(V3),
      resp_round = "Wave 2")

# Stack wave 1 on top of wave 2; columns present in only one wave are
# filled with NA for the rows of the other wave
  stacked_raw <- bind_rows(doc_t1_clean, doc_t2_clean)
```

# Date ranges for missing values

```{r}
# foo <- stacked_clean %>% 
#   filter(is.na(resp_interview_date)) %>% distinct(resp_round, resp_country_common)
# 
# for(i in c("Turkey","Egypt","Tunisia")){
#   for (j in c("Wave 1","Wave 2")){
#     foo <- stacked_clean %>% 
#     filter(resp_round == j & resp_country_common == i)
#     min_date <- min(foo$resp_interview_date, na.rm = T)
#     max_date <- max(foo$resp_interview_date, na.rm = T)
#     print(paste(i,j,min_date,max_date))
#   }
# }
```

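Some interview dates are missing in each country and round. The chunk above (left commented out; it refers to `stacked_clean`, which is only created in the Clean section below) prints the earliest and latest observed interview date for each country/wave.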

# Clean

```{r}
# Fieldwork window (first and last interview date) for each country and wave;
# written as plain strings and parsed to Date in one step below
  dates <-
    tribble(
      ~resp_country_common, ~resp_round, ~resp_interview_start_date, ~resp_interview_end_date,
      "Turkey",  "Wave 1", "2013-03-16", "2013-06-26",
      "Turkey",  "Wave 2", "2016-02-09", "2016-07-01",
      "Egypt",   "Wave 1", "2011-06-12", "2011-08-07",
      "Egypt",   "Wave 2", "2016-08-19", "2016-12-08",
      "Tunisia", "Wave 1", "2013-03-15", "2013-06-02",
      "Tunisia", "Wave 2", "2015-05-04", "2015-08-21"
    ) %>%
    mutate(
      resp_interview_start_date = ymd(resp_interview_start_date),
      resp_interview_end_date = ymd(resp_interview_end_date))

# clean: build the harmonised resp_* variables from the stacked waves.
# Flow: (1) metadata columns, (2) join the fieldwork windows from `dates` by
# country and wave, (3) demographics, attitude items and religiosity,
# (4) keep only the resp_* columns.
  stacked_clean <- 
    stacked_raw %>% 
    mutate(
      
    #########################  
    ####### META-DATA #######  
    #########################      
      
      # source name (character vector, title case)
        resp_source = "Comparative Panel Survey on the Dynamics of Change: Belief Formation and Political Engagement in Egypt, Tunisia, and Turkey",
        
      # round number (character vector, title case)  
      # (already set to "Wave 1"/"Wave 2" when the waves were split; restated here unchanged)
        resp_round = resp_round,
      
      # url to dataset source, where publicly available (character vector)
        resp_original_data_url = "bit.ly/3lSnAiB",

      # survey mode (in-person/phone/internet)
      # RESULTCODE_T2 was only selected for wave 2, so every other row
      # (including a missing code) falls through to "in-person"
        resp_survey_mode = 
          case_when(
            RESULTCODE_T2 == 1002 ~ "phone", # this was a panel; some in wave 2 answered by phone
            TRUE ~ "in-person"),     

      # country (character vector; list of countries as written in original source)
      # numeric country codes are mapped to names; a code not listed below is kept as its character code
        resp_country_original = 
          dplyr::recode(
            as.character(country),
            "788" = "Tunisia",
            "818" = "Egypt",
            "949" = "Turkey"),

      # country (character vector; converts to countrycode country.name list)
        resp_country_common = 
          countryname(resp_country_original),
        
      # interview date (variable of class Date; if only month given, input 1st of month)
        resp_interview_date = DATE) %>%
    # attach resp_interview_start_date / resp_interview_end_date for each country and wave
    left_join(
      dates, by = c("resp_country_common","resp_round")) %>% 
    mutate(
   
    #########################  
    ##### DEMOGRAPHICS ######  
    #########################
      
      # respondent's religion (character vector that corresponds to master list)
      # V3 holds label text at this point (converted with to_character() when the waves were split)
        resp_religion = 
          case_when(
            V3 %in% c(
              "Muslim-not specified (option volunteered in Tunisia and Turkey)",
              "Muslim-Sunni",
              "Volunteered: Maliki",
              "Muslim-Shia",
              "Muslim-Alevi",
              "Volunteered: Hanafi",
              "Volunteered: Shafi",
              "Volunteered: Muslim Arab",
              "Volunteered: Tunisian Muslim",
              "Volunteered: Muslim-Nusayri"
              ) ~ "Muslim",
            
            V3 %in% c(
              "Christian"
            ) ~ "Christian",

            V3 %in% c(
              "Jewish"
            ) ~ "Jewish",
            
            V3 %in% c(
              "Volunteered: Deist"
            ) ~ "Other religion",            
                        
            TRUE ~ NA_character_
          ),

      # respondent's denomination (character vector)
      # only the Sunni and Shia labels are shortened; every other V3 value
      # (including non-Muslim labels and NA) is passed through unchanged
      # NOTE(review): confirm the pass-through values correspond to the master list
        resp_denomination =
          case_when(
            V3 == "Muslim-Sunni" ~ "Sunni",
            V3 == "Muslim-Shia" ~ "Shia",
            TRUE ~ V3),

      # respondent's age (character vector; bins denoted by single dash ["18-25"])
      # no binning is applied here: V214 is converted to character and "-4" is set to NA
        resp_age = # significant missingness, but this is a feature of the original data
          dplyr::recode(
            as.character(V214),
            "-4" = NA_character_),

      # respondent's education level (original label, with the harmonised bin in square brackets)
      # any code other than 1-9 becomes NA
      # NOTE(review): codes 5, 7 and 8 (complete secondary / some university) carry the
      # "[Primary]" tag, the same as code 3 -- confirm this is the intended harmonised bin
        resp_education_original = # same missingness as with age; a large segment of respondents don't have demographics
          dplyr::recode(
            as.character(V202),
            "1" = "1. No formal education [No education]",
            "2" = "2. Incomplete primary school [No education]",
            "3" = "3. Complete primary school [Primary]",
            "4" = "4. Incomplete secondary school: technical/vocational type [Primary]",
            "5" = "5. Complete secondary school: technical/vocational type [Primary]",
            "6" = "6. Incomplete secondary: university-preparatory type [Primary]",
            "7" = "7. Complete secondary: university-preparatory type [Primary]",
            "8" = "8. Some university-level education, without a degree [Primary]",
            "9" = "9. University-level education, with degree (include post-graduate education) [College]",
            .default = NA_character_),       
      
      # respondent's gender (numeric: female = 1; male = 0; other = NA)
      # V215 code 5 is treated as female and code 1 as male
        resp_female = # note, unusually, significant missingness in this variable; confirmed in original
          case_when(
            V215 == 5 ~ 1,
            V215 == 1 ~ 0,
            TRUE ~ NA_real_),
      
      # respondent resident in rural (vs urban) area (numeric: rural = 1; urban/semi-urban/peri-urban = 0)
      # the wave-1 classification (UrbanRural_T1) is used for the rows of both waves
        resp_rural = 
          case_when(
            UrbanRural_T1 == 5 ~ 1,
            UrbanRural_T1 == 1 ~ 0,
            TRUE ~ NA_real_) ,
      
    #########################  
    ### SOCIAL DISTANCE 1 ###  
    #########################
    
      # original question number; question text; response options (input above)
        resp_soc_dist_1_qinfo = "NUM: V149; QTEXT: Do you like or dislike Sunnis as neighbors?; ROPTIONS: 1 = Like [=0] + 2 = Dislike [=1]; TARGET: Sunni; TYPE: Distance, neighbor",
      
      # original response (as character vector)
      # negative codes are set to NA; all other codes are kept as character
        resp_soc_dist_1_original = 
          dplyr::recode(
            as.character(V149),
            "-5" = NA_character_,
            "-4" = NA_character_,
            "-2" = NA_character_,
            "-1" = NA_character_),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_1_bin_recode = 
          case_when(
            V149 %in% c(2) ~ 1,
            V149 %in% c(1) ~ 0,
            TRUE ~ NA_real_),

    #########################  
    ### SOCIAL DISTANCE 2 ###  
    #########################

      # original question number; question text; response options (input above)
      # V154 is read as the Shia item for Egypt only (the Turkey rows are used for SOCIAL DISTANCE 3)
        resp_soc_dist_2_qinfo = "NUM: V154; QTEXT: Do you like or dislike Shi'is as neighbors?; ROPTIONS: 1 = Like [=0] + 2 = Dislike [=1]; TARGET: Shia; TYPE: Distance, neighbor",
      
      # original response (as character vector)
        resp_soc_dist_2_original =
          case_when(
            resp_country_original == "Egypt" & V154 == 1 ~ "1",
            resp_country_original == "Egypt" & V154 == 2 ~ "2",
            TRUE ~ NA_character_),

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_2_bin_recode = 
          case_when(
            resp_soc_dist_2_original == "2" ~ 1,
            resp_soc_dist_2_original == "1" ~ 0,
            TRUE ~ NA_real_),
            
    #########################  
    ### SOCIAL DISTANCE 3 ###  
    #########################

      # original question number; question text; response options (input above)
      # V154 is read as the Alawi item for Turkey only
        resp_soc_dist_3_qinfo = "NUM: V154; QTEXT: Do you like or dislike Allawis as neighbors?; ROPTIONS: 1 = Like [=0] + 2 = Dislike [=1]; TARGET: Alawi; TYPE: Distance, neighbor",
      
      # original response (as character vector)
        resp_soc_dist_3_original =
          case_when(
            resp_country_original == "Turkey" & V154 == 1 ~ "1",
            resp_country_original == "Turkey" & V154 == 2 ~ "2",
            TRUE ~ NA_character_),

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_3_bin_recode = 
          case_when(
            resp_soc_dist_3_original == "2" ~ 1,
            resp_soc_dist_3_original == "1" ~ 0,
            TRUE ~ NA_real_),

    #########################  
    ### SOCIAL DISTANCE 4 ###  
    #########################

      # original question number; question text; response options (input above)
        resp_soc_dist_4_qinfo = "NUM: V155; QTEXT: Do you like or dislike Jews as neighbors?; ROPTIONS: 1 = Like [=0] + 2 = Dislike [=1]; TARGET: Jewish; TYPE: Distance, neighbor",
      
      # original response (as character vector)
        resp_soc_dist_4_original = 
          dplyr::recode(
            as.character(V155),
            "-5" = NA_character_,
            "-4" = NA_character_,
            "-2" = NA_character_,
            "-1" = NA_character_),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_4_bin_recode = 
          case_when(
            V155 %in% c(2) ~ 1,
            V155 %in% c(1) ~ 0,
            TRUE ~ NA_real_),

    #########################  
    ### SOCIAL DISTANCE 5 ###  
    #########################

      # original question number; question text; response options (input above)
        resp_soc_dist_5_qinfo = "NUM: V156; QTEXT: Do you like or dislike Christians as neighbors?; ROPTIONS: 1 = Like [=0] + 2 = Dislike [=1]; TARGET: Christian; TYPE: Distance, neighbor",
      
      # original response (as character vector)
        resp_soc_dist_5_original = 
          dplyr::recode(
            as.character(V156),
            "-5" = NA_character_,
            "-4" = NA_character_,
            "-2" = NA_character_,
            "-1" = NA_character_),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_5_bin_recode = 
          case_when(
            V156 %in% c(2) ~ 1,
            V156 %in% c(1) ~ 0,
            TRUE ~ NA_real_),

    #########################  
    ### SOCIAL DISTANCE 6 ###  
    #########################

      # original question number; question text; response options (input above)
      # V164 is read as trust in Alawis for Turkey only (the Tunisia rows are used for SOCIAL DISTANCE 7)
        resp_soc_dist_6_qinfo = "NUM: V164; QTEXT: I am going to name a number of groups and institutions. For each one, could you tell me how much trust, in general, you have in them: is it a lot, some, only a little, or none at all? Allawi.; ROPTIONS: 1 = A lot [=0] + 2 = Some [=0] + 3 = Only a little [=1] + 4 = Not at all [=1]; TARGET: Alawi; TYPE: Trust",
      
      # original response (as character vector)
        resp_soc_dist_6_original = 
          case_when(
            resp_country_original == "Turkey" & V164 == 1 ~ "1",
            resp_country_original == "Turkey" & V164 == 2 ~ "2",
            resp_country_original == "Turkey" & V164 == 3 ~ "3",
            resp_country_original == "Turkey" & V164 == 4 ~ "4",
            TRUE ~ NA_character_
          ),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_6_bin_recode = 
          case_when(
            resp_country_original == "Turkey" & V164 %in% c(1, 2) ~ 0,
            resp_country_original == "Turkey" & V164 %in% c(3, 4) ~ 1,
            TRUE ~ NA_real_),

    #########################  
    ### SOCIAL DISTANCE 7 ###  
    #########################

      # original question number; question text; response options (input above)
      # V164 is read as trust in Shia for Tunisia only
        resp_soc_dist_7_qinfo = "NUM: V164; QTEXT: I am going to name a number of groups and institutions. For each one, could you tell me how much trust, in general, you have in them: is it a lot, some, only a little, or none at all? Shi'a.; ROPTIONS: 1 = A lot [=0] + 2 = Some [=0] + 3 = Only a little [=1] + 4 = Not at all [=1]; TARGET: Shia; TYPE: Trust",
      
      # original response (as character vector)
        resp_soc_dist_7_original = 
          case_when(
            resp_country_original == "Tunisia" & V164 == 1 ~ "1",
            resp_country_original == "Tunisia" & V164 == 2 ~ "2",
            resp_country_original == "Tunisia" & V164 == 3 ~ "3",
            resp_country_original == "Tunisia" & V164 == 4 ~ "4",
            TRUE ~ NA_character_
          ),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_7_bin_recode = 
          case_when(
            resp_country_original == "Tunisia" & V164 %in% c(1, 2) ~ 0,
            resp_country_original == "Tunisia" & V164 %in% c(3, 4) ~ 1,
            TRUE ~ NA_real_),

    #########################  
    ### SOCIAL DISTANCE 8 ###  
    #########################

      # original question number; question text; response options (input above)
      # V165 is read as trust in Sunnis for Turkey only (the Tunisia rows are used for SOCIAL DISTANCE 9)
        resp_soc_dist_8_qinfo = "NUM: V165; QTEXT: I am going to name a number of groups and institutions. For each one, could you tell me how much trust, in general, you have in them: is it a lot, some, only a little, or none at all? Sunnis.; ROPTIONS: 1 = A lot [=0] + 2 = Some [=0] + 3 = Only a little [=1] + 4 = Not at all [=1]; TARGET: Sunni; TYPE: Trust",
      
      # original response (as character vector)
        resp_soc_dist_8_original = 
          case_when(
            resp_country_original == "Turkey" & V165 == 1 ~ "1",
            resp_country_original == "Turkey" & V165 == 2 ~ "2",
            resp_country_original == "Turkey" & V165 == 3 ~ "3",
            resp_country_original == "Turkey" & V165 == 4 ~ "4",
            TRUE ~ NA_character_
          ),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_8_bin_recode = 
          case_when(
            resp_country_original == "Turkey" & V165 %in% c(1, 2) ~ 0,
            resp_country_original == "Turkey" & V165 %in% c(3, 4) ~ 1,
            TRUE ~ NA_real_),

    #########################  
    ### SOCIAL DISTANCE 9 ###  
    #########################

      # original question number; question text; response options (input above)
      # V165 is read as trust in Salafis for Tunisia only
        resp_soc_dist_9_qinfo = "NUM: V165; QTEXT: I am going to name a number of groups and institutions. For each one, could you tell me how much trust, in general, you have in them: is it a lot, some, only a little, or none at all? Salafis.; ROPTIONS: 1 = A lot [=0] + 2 = Some [=0] + 3 = Only a little [=1] + 4 = Not at all [=1]; TARGET: Salafi; TYPE: Trust",
      
      # original response (as character vector)
        resp_soc_dist_9_original = 
          case_when(
            resp_country_original == "Tunisia" & V165 == 1 ~ "1",
            resp_country_original == "Tunisia" & V165 == 2 ~ "2",
            resp_country_original == "Tunisia" & V165 == 3 ~ "3",
            resp_country_original == "Tunisia" & V165 == 4 ~ "4",
            TRUE ~ NA_character_
          ),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_9_bin_recode = 
          case_when(
            resp_country_original == "Tunisia" & V165 %in% c(1, 2) ~ 0,
            resp_country_original == "Tunisia" & V165 %in% c(3, 4) ~ 1,
            TRUE ~ NA_real_),

    ##########################  
    ### SOCIAL DISTANCE 10 ###  
    ##########################

      # original question number; question text; response options (input above)
      # V168 was only selected for wave 1, so wave-2 rows are NA on this item
        resp_soc_dist_10_qinfo = "NUM: V168; QTEXT: I am going to name a number of groups and institutions. For each one, could you tell me how much trust, in general, you have in them: is it a lot, some, only a little, or none at all? Jews.; ROPTIONS: 1 = A lot [=0] + 2 = Some [=0] + 3 = Only a little [=1] + 4 = Not at all [=1]; TARGET: Jewish; TYPE: Trust",
      
      # original response (as character vector)
        resp_soc_dist_10_original = 
          case_when(
            V168 == 1 ~ "1",
            V168 == 2 ~ "2",
            V168 == 3 ~ "3",
            V168 == 4 ~ "4",
            TRUE ~ NA_character_
          ),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_10_bin_recode = 
          case_when(
            V168 %in% c(1, 2) ~ 0,
            V168 %in% c(3, 4) ~ 1,
            TRUE ~ NA_real_),

    ##########################  
    ### SOCIAL DISTANCE 11 ###  
    ##########################

      # original question number; question text; response options (input above)
        resp_soc_dist_11_qinfo = "NUM: V169; QTEXT: I am going to name a number of groups and institutions. For each one, could you tell me how much trust, in general, you have in them: is it a lot, some, only a little, or none at all? Christians.; ROPTIONS: 1 = A lot [=0] + 2 = Some [=0] + 3 = Only a little [=1] + 4 = Not at all [=1]; TARGET: Christian; TYPE: Trust",
      
      # original response (as character vector)
        resp_soc_dist_11_original = 
          case_when(
            V169 == 1 ~ "1",
            V169 == 2 ~ "2",
            V169 == 3 ~ "3",
            V169 == 4 ~ "4",
            TRUE ~ NA_character_
          ),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_11_bin_recode = 
          case_when(
            V169 %in% c(1, 2) ~ 0,
            V169 %in% c(3, 4) ~ 1,
            TRUE ~ NA_real_),

   ############################  
   ### GENERAL SOCIAL TRUST ###  
   ############################
    
      # original question number; question text; response options (input above)
        resp_gentrust_qinfo = "NUM: T2.V207; QTEXT: Now I’m going to read you a series of statements about trust. People can be trusted.; ROPTIONS: 1 = Strongly agree [=0] + 2 = Agree [=0] + 3 = Disagree [=1] + 4 = Strongly disagree [=1]", # only for t2

      # original response (as character vector)
      # negative codes and codes 8/9 are set to NA; all other codes are kept as character
        resp_gentrust_original = 
          dplyr::recode(
            as.character(T2_V207),
            "-5" = NA_character_,
            "-4" = NA_character_,
            "-3" = NA_character_,
            "-2" = NA_character_,
            "-1" = NA_character_,
            "8" = NA_character_,
            "9" = NA_character_),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
      # no explicit fallback branch here: values outside 1-4 are left as NA by case_when()
        resp_gentrust_bin_recode = 
          case_when(
            T2_V207 %in% c(1, 2) ~ 0,
            T2_V207 %in% c(3, 4) ~ 1),    
    
    #########################  
    ##### RELIGIOSITY #######  
    #########################
    
      # original question number; question text; response options (input above)
        resp_religiosity_qinfo = "NUM: V71; QTEXT: To what extent do you consider yourself a religious person?; ROPTIONS: scale 1-10, 1 = Not at all religious, 10 = Very religious",
  
      # original response (as numeric vector, with non-substantive responses coded as NA_real_)
        resp_religiosity_original = 
          dplyr::recode(
            as.numeric(V71),
            `-4` = NA_real_,
            `-2` = NA_real_,
            `-1` = NA_real_),       

      # recode (numeric: scaled 0-1, where 1 is more religious)
      # maps the 1-10 scale linearly: 1 -> 0 and 10 -> 1
        resp_religiosity_recode = (resp_religiosity_original - 1)/9
    ) %>% 
    # drop the raw survey columns; only the harmonised resp_* variables are kept
    select(starts_with("resp_"))
```

# Data checks

There are highly unusual patterns in several variables concerning willingness to accept neighbors. Check:

```{r}
# Key to the binary recodes (1 = negative attitude expressed, 0 otherwise):
# dislike sunni neighbors: resp_soc_dist_1_bin_recode
# dislike shia neighbors: resp_soc_dist_2_bin_recode
# dislike allawi neighbors: resp_soc_dist_3_bin_recode
# dislike jews neighbors: resp_soc_dist_4_bin_recode
# dislike christians neighbors: resp_soc_dist_5_bin_recode

# trust in allawis: resp_soc_dist_6_bin_recode
# trust in shia: resp_soc_dist_7_bin_recode
# trust in sunni: resp_soc_dist_8_bin_recode
# trust in salafis: resp_soc_dist_9_bin_recode ### note, we do not consider this a clear sect
# trust in jews: resp_soc_dist_10_bin_recode
# trust in christians: resp_soc_dist_11_bin_recode

# see breakdown of means: share expressing a negative attitude, by country,
# wave and whether the respondent is Sunni (items 6 and 9 are not summarised here)
  checks <- 
    stacked_clean %>%
    mutate(
      # any non-missing denomination other than "Sunni" counts as "Non-Sunni";
      # a missing denomination stays NA and forms its own group
      resp_sunni = 
        case_when(
          resp_denomination == "Sunni" ~ "Sunni", 
          resp_denomination != "Sunni" ~ "Non-Sunni", 
          TRUE ~ NA_character_)) %>% 
    group_by(resp_country_common, resp_round, resp_sunni) %>% 
    summarise(
      mean_neighbor_antisunni = mean(resp_soc_dist_1_bin_recode, na.rm = TRUE),
      mean_neighbor_antishia = mean(resp_soc_dist_2_bin_recode, na.rm = TRUE),
      mean_neighbor_antialawi = mean(resp_soc_dist_3_bin_recode, na.rm = TRUE),
      mean_neighbor_antijew = mean(resp_soc_dist_4_bin_recode, na.rm = TRUE),
      mean_neighbor_antichristian = mean(resp_soc_dist_5_bin_recode, na.rm = TRUE),
      
      mean_trust_antishia = mean(resp_soc_dist_7_bin_recode, na.rm = TRUE),
      mean_trust_antisunni = mean(resp_soc_dist_8_bin_recode, na.rm = TRUE),
      mean_trust_antijew = mean(resp_soc_dist_10_bin_recode, na.rm = TRUE),
      mean_trust_antichristian = mean(resp_soc_dist_11_bin_recode, na.rm = TRUE)) %>% 
    # summarise() only peels off the last grouping level; drop the rest so the
    # table is not left grouped by country and wave
    ungroup()
```
It seems clear that the neighbor questions have been miscoded: e.g., 96 percent of Sunnis in Turkey appear to be opposed to Sunni neighbors. Keep only the trust questions.

# Drop what appear to be erroneously coded variables

```{r}
# Following the conclusions of the previous chunk, remove every column
# (qinfo, original, bin_recode) belonging to the items listed below.
# The trailing underscore in each prefix keeps items 10 and 11 from matching "1_".
  final <-
    select(
      stacked_clean,
      -starts_with(c(
        "resp_soc_dist_1_", # dislike sunni neighbors
        "resp_soc_dist_2_", # dislike shia neighbors
        "resp_soc_dist_3_", # dislike allawi neighbors
        "resp_soc_dist_4_", # dislike jews neighbors
        "resp_soc_dist_5_", # dislike christians neighbors
        "resp_soc_dist_9_"  # trust in salafis (which we do not consider a sect)
      )))
```

# Save data

```{r}
  # write the final data frame to the cleaned-data folder as an .rds file
  saveRDS(
    object = final,
    file = "../cleaned-data/y-10-multi-middle-east-values-study-cpsdc.rds")
```
